In this example we will use the IPUMS USA data to produce survey-based estimates for various geographic levels present in the IPUMS. This example uses the 2014-2018 ACS 5-year microdata.
The good folks at IPUMS have created a library to read in their data from the .gz file that you download. Be sure you right click and save the DDI codebook when you create your extract
This will save the xml file that contains all the information on the
data (what is contained in the data file) to your computer. When using
IPUMS, it will have a name like usa_xxxxx.xml where the x’s
represent the extract number (I’m on 92 as of today).
You will also need to download the data file, by right clicking the
Download.DAT link in the above image. This will save a
.gz file to your computer, again with a name like:
usa_xxxxx.dat.gz. Make sure this file and the xml file from
above are in the same folder, preferably your class folder.
Be sure the ipumsr package is installed.
library(ipumsr)
ddi <- read_ipums_ddi("C:/Users/ozd504//OneDrive - University of Texas at San Antonio//gis_classwork/usa_00083.xml")
data <- read_ipums_micro(ddi)
## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
data<-haven::zap_labels(data) #necessary to avoid problems with "labelled" data class
names(data)<-tolower(names(data))
library(survey, quietly = T)
## Warning: package 'Matrix' was built under R version 4.1.3
## Warning: package 'survival' was built under R version 4.1.3
library(tidyverse, quietly = T)
library(car, quietly = T)
library(ggplot2, quietly = T)
library(tigris, quietly = T)
library(classInt, quietly = T)
library(tmap, quietly = T)
The Public Use Microdata Area is the lowest level of geography in the PUMS data. They correspond to greographic ares of ~ 100,000 people.
options(tigris_class = "sf")
pumas<-pumas(state = "TX",
year = 2018,
cb = T)
##
|
| | 0%
|
|== | 3%
|
|=== | 4%
|
|==== | 6%
|
|===== | 7%
|
|======= | 10%
|
|======== | 11%
|
|========= | 13%
|
|========== | 14%
|
|============ | 17%
|
|============= | 18%
|
|============== | 20%
|
|=============== | 22%
|
|================== | 25%
|
|=================== | 27%
|
|==================== | 29%
|
|====================== | 31%
|
|======================= | 32%
|
|======================== | 35%
|
|========================= | 36%
|
|=========================== | 38%
|
|============================ | 39%
|
|============================= | 42%
|
|============================== | 43%
|
|================================ | 45%
|
|================================= | 46%
|
|=================================== | 50%
|
|===================================== | 54%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================== | 59%
|
|========================================== | 61%
|
|============================================ | 63%
|
|============================================= | 64%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================= | 70%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|======================================================= | 78%
|
|========================================================= | 82%
|
|=========================================================== | 84%
|
|============================================================ | 86%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================= | 93%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|===================================================================== | 99%
|
|======================================================================| 100%
plot(pumas["GEOID10"],
main = "Public Use Microdata Areas in Texas")
mapview::mapview(pumas, zcol= "GEOID10")
Here I recode several demographic variables
data$pwt <- data$perwt
data$hwt <- data$hhwt
#race/ethnicity
data$hisp <- Recode(data$hispan, recodes = "9=NA; 1:4='Hispanic'; 0='NonHispanic'")
data$race_rec <- Recode(data$race, recodes = "1='White'; 2='Black'; 3='Other'; 4:6='Asian'; 7:9='Other'")
data$race_eth <- interaction(data$hisp, data$race_rec, sep = "_")
data$race_eth <- as.factor(ifelse(substr(as.character(data$race_eth),1,8) == "Hispanic", "Hispanic", as.character(data$race_eth)))
data$race_eth <- relevel(data$race_eth, ref = "NonHispanic_White")
#sex
data$male <- ifelse(data$sex == 1,1,0)
#education
data$educ_level<- Recode(data$educd, recodes = "2:61='0LT_HS';62:64='1_HSD/GED';65:80='2_somecoll';90:100='2_somecoll'; 81:83='3_AssocDegree';101='4_bachelordegree'; 110:116='4_BAplus_GradDegree'; else=NA")
#employment
data$employed <- Recode(data$wrklstwk, recodes = "1=0;2=1; else=NA")
#citizenship
data$cit<-Recode(data$citizen, recodes = "1='US born'; 2='naturalized'; 3:4='notcitizen';else=NA ")
#industry
data$ind_group<-Recode(data$ind, recodes = "170:490='ag_extract'; 770='construction'; 1070:3990='manufac'; 4070:5790='whole_retail'; 6070:6390='trans'; 6470:6780='information'; 6870:7190= 'fire'; 7270=7790='prof/sci/manage'; 7860:8470='edu/social'; 8560:8690='arts'; 8770:9290='other'; 9370:9590='public_adm'; 9670:9870='military'; else=NA ")
data$proftech <- Recode(data$ind, recodes = "7270:7490=1; 0=NA; else=0")
#age in 10 year intervals
data$agecat<-cut(data$age, breaks = c(0, 18, 20, 30, 40, 50, 65, 120), include.lowest = T)
data$income <- ifelse(data$incwage>=999998, NA, data$incwage)
Here we identify the person weights and the survey design variables.
des<-svydesign(ids = ~cluster,
strata = ~ strata,
weights = ~pwt,
data = data)
The svyby() function allows us calculate estimates for
different sub-domains within the data, this could be a
demographic characteristic, but we’ll use our geographic level. Of
course you could do both….
test<-svytable(~I(cit=="US born")+puma+sex, design=des )
puma_est_edu<-svyby(formula = ~educ_level,
by = ~puma,
design = subset(des, age>25),
FUN=svymean,
na.rm = TRUE )
puma_est_employ<-svyby(formula = ~employed,
by = ~puma,
design=subset(des, age %in% 18:65),
FUN=svymean,
na.rm = TRUE )
puma_est_industry<-svyby(formula = ~proftech,
by = ~puma+sex+race_eth,
design = subset(des, employed==1),
FUN = svymean,
na.rm = TRUE )
library(reldist)
## Warning: package 'reldist' was built under R version 4.1.3
## reldist: Relative Distribution Methods
## Version 1.7-0 created on 2021-11-10.
## copyright (c) 2003, Mark S. Handcock, University of California-Los Angeles
## For citation information, type citation("reldist").
## Type help(package="reldist") to get started.
gini.puma<-data%>%
filter(income >0, is.na(income)==F)%>%
group_by( puma, race_eth)%>%
summarize(ineq = gini(income, weights = pwt))%>%
ungroup()
## `summarise()` has grouped output by 'puma'. You can override using the
## `.groups` argument.
head(puma_est_edu)
head(puma_est_employ)
head(puma_est_industry)
pumas$puma<-as.numeric(pumas$PUMACE10)
geo1<-left_join(pumas, puma_est_employ, by=c("puma"= "puma"))
head(geo1)
geo2<-left_join(pumas, puma_est_industry, by=c("puma"= "puma"))
head(geo2)
geo3<-left_join(pumas, gini.puma, by=c("puma"= "puma"))
head(geo2)
tmap_mode("view")
## tmap mode set to interactive viewing
geo1%>%
tm_shape()+
tm_polygons("employed",
style="kmeans",
n=8,
legend.hist = TRUE) +
tm_layout(legend.outside = TRUE,
title = "Employment rate in Texas PUMAs \n 2014-2018")
#tmap_mode("plot")
geo2%>%
tm_shape()+
tm_polygons("proftech",
style="kmeans",
n=8,
legend.hist = TRUE) +
tm_layout(legend.outside = TRUE,
title = "Employment rate in Texas PUMAs \n 2014-2018")